{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Transform dataset to a Axoltol supported format\n",
    "def convert_to_sharegpt(json_file, output_file, id_prefix, start_id=0):\n",
    "    with open(json_file, 'r') as file:\n",
    "        data = json.load(file)\n",
    "\n",
    "    with open(output_file, 'w') as file:\n",
    "        for entry in data:\n",
    "            conversation_id = f\"{id_prefix}_{entry['id']+start_id}\"\n",
    "            instruction = entry['instruction']\n",
    "            response = entry['response']\n",
    "            conversations = [\n",
    "                {\"from\": \"human\", \"value\": entry['instruction']},\n",
    "                {\"from\": \"gpt\", \"value\": entry['response']}\n",
    "            ]\n",
    "            gen_input_configs = entry['gen_input_configs']\n",
    "            gen_input_configs['pre_query_template'] = entry['pre_query_template']\n",
    "            intent = entry['intent']\n",
    "            knowledge = entry['knowledge']\n",
    "            difficulty = entry['difficulty'] if entry['difficulty'] in ['very easy', 'easy', 'medium', 'hard', 'very hard'] else None\n",
    "            difficulty_generator = entry['difficulty_generator']\n",
    "            input_quality = entry['input_quality'] if entry['input_quality'] in ['very poor', 'poor', 'average', 'good', 'excellent'] else None\n",
    "            quality_explanation = entry['quality_explanation']\n",
    "            quality_generator = entry['quality_generator']\n",
    "            task_category = entry['task_category'] if entry['task_category'] in [\"Information seeking\", \"Reasoning\", \"Planning\", \"Editing\", \"Coding & Debugging\", \"Math\", \"Role playing\", \"Data analysis\", \"Creative writing\", \"Advice seeking\", \"Brainstorming\", \"Other\"] else None\n",
    "            other_task_category = entry['other_task_category']\n",
    "            task_category_generator = entry['task_category_generator']\n",
    "            llama_guard_2 = entry['llama_guard_2']\n",
    "            instruct_reward = entry['instruct_reward']\n",
    "            reward_model = entry['reward_model']\n",
    "            language = entry['language']\n",
    "\n",
    "            if entry['gen_input_configs']['input_generator'] != entry['gen_response_configs']['output_generator']:\n",
    "                raise ValueError(\"Input and output generators must be the same\")\n",
    "            \n",
    "            if id_prefix not in entry['gen_input_configs']['input_generator']:\n",
    "                raise ValueError(f\"Input generator must contain {id_prefix}\")\n",
    "\n",
    "            sharegpt_entry = {\n",
    "                \"conversation_id\": conversation_id,\n",
    "                \"instruction\": instruction,\n",
    "                \"response\": response,\n",
    "                \"conversations\": conversations,\n",
    "                \"gen_input_configs\": gen_input_configs,\n",
    "                \"intent\": intent,\n",
    "                \"knowledge\": knowledge,\n",
    "                \"difficulty\": difficulty,\n",
    "                \"difficulty_generator\": difficulty_generator,\n",
    "                \"input_quality\": input_quality,\n",
    "                \"quality_explanation\": quality_explanation,\n",
    "                \"quality_generator\": quality_generator,\n",
    "                \"task_category\": task_category,\n",
    "                \"other_task_category\": other_task_category,\n",
    "                \"task_category_generator\": task_category_generator,\n",
    "                \"llama_guard_2\": llama_guard_2,\n",
    "                \"instruct_reward\": instruct_reward,\n",
    "                \"reward_model\": reward_model,\n",
    "                \"language\": language\n",
    "            }\n",
    "\n",
    "            file.write(json.dumps(sharegpt_entry) + '\\n')\n",
    "    \n",
    "    print(f\"Converted {len(data)} entries to {output_file}\")\n",
    "    return len(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 10%|█         | 1/10 [00:06<00:55,  6.11s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Converted 100000 entries to Qwen2-72B-Instruct_sharegpt_shard0.jsonl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 20%|██        | 2/10 [00:13<00:53,  6.71s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Converted 100000 entries to Qwen2-72B-Instruct_sharegpt_shard1.jsonl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 30%|███       | 3/10 [00:19<00:45,  6.52s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Converted 100000 entries to Qwen2-72B-Instruct_sharegpt_shard2.jsonl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 40%|████      | 4/10 [00:25<00:38,  6.34s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Converted 100000 entries to Qwen2-72B-Instruct_sharegpt_shard3.jsonl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 50%|█████     | 5/10 [00:33<00:33,  6.74s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Converted 100000 entries to Qwen2-72B-Instruct_sharegpt_shard4.jsonl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 60%|██████    | 6/10 [00:39<00:26,  6.58s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Converted 100000 entries to Qwen2-72B-Instruct_sharegpt_shard5.jsonl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 70%|███████   | 7/10 [00:46<00:19,  6.63s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Converted 100000 entries to Qwen2-72B-Instruct_sharegpt_shard6.jsonl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 80%|████████  | 8/10 [00:52<00:13,  6.63s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Converted 100000 entries to Qwen2-72B-Instruct_sharegpt_shard7.jsonl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 90%|█████████ | 9/10 [00:59<00:06,  6.78s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Converted 100000 entries to Qwen2-72B-Instruct_sharegpt_shard8.jsonl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 10/10 [01:07<00:00,  6.72s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Converted 100000 entries to Qwen2-72B-Instruct_sharegpt_shard9.jsonl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "100%|██████████| 10/10 [00:09<00:00,  1.03it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Concatenated 10 files to Qwen2-72B-Instruct_sharegpt.jsonl\n"
     ]
    }
   ],
   "source": [
    "input_files = [\n",
    "    \"Qwen2-72B-Instruct_topp1_temp1_1718659695/Magpie_Qwen2-72B-Instruct_100000_1718659695_ins_res_difficulty_quality_category_safety_reward_language.json\",\n",
    "    \"Qwen2-72B-Instruct_topp1_temp1_1718693980/Magpie_Qwen2-72B-Instruct_100000_1718693980_ins_res_difficulty_quality_category_safety_reward_language.json\",\n",
    "    \"Qwen2-72B-Instruct_topp1_temp1_1718702029/Magpie_Qwen2-72B-Instruct_100000_1718702029_ins_res_difficulty_quality_category_safety_reward_language.json\",\n",
    "    \"Qwen2-72B-Instruct_topp1_temp1.1_1718775995/Magpie_Qwen2-72B-Instruct_100000_1718775995_ins_res_difficulty_quality_category_safety_reward_language.json\",\n",
    "    \"Qwen2-72B-Instruct_topp1_temp1.1_1718776098/Magpie_Qwen2-72B-Instruct_100000_1718776098_ins_res_difficulty_quality_category_safety_reward_language.json\",\n",
    "    \"Qwen2-72B-Instruct_topp1_temp1.1_1718818690/Magpie_Qwen2-72B-Instruct_100000_1718818690_ins_res_difficulty_quality_category_safety_reward_language.json\",\n",
    "    \"Qwen2-72B-Instruct_topp1_temp1.1_1718868461/Magpie_Qwen2-72B-Instruct_100000_1718868461_ins_res_difficulty_quality_category_safety_reward_language.json\",\n",
    "    \"Qwen2-72B-Instruct_topp0.99_temp1.2_1718868635/Magpie_Qwen2-72B-Instruct_100000_1718868635_ins_res_difficulty_quality_category_safety_reward_language.json\",\n",
    "    \"Qwen2-72B-Instruct_topp0.99_temp1.2_1718897681/Magpie_Qwen2-72B-Instruct_100000_1718897681_ins_res_difficulty_quality_category_safety_reward_language.json\",\n",
    "    \"Qwen2-72B-Instruct_topp0.99_temp1.2_1718950026/Magpie_Qwen2-72B-Instruct_100000_1718950026_ins_res_difficulty_quality_category_safety_reward_language.json\",\n",
    "]\n",
    "\n",
    "# Convert each file to Axolotl format\n",
    "idx = 0\n",
    "id_prefix = \"Qwen2-72B-Instruct\"\n",
    "converted_files = []\n",
    "for i in tqdm(range(len(input_files))):\n",
    "    converted_file_name = f\"{id_prefix}_sharegpt_shard{i}.jsonl\"\n",
    "    len_data = convert_to_sharegpt(input_files[i], converted_file_name, id_prefix, idx)\n",
    "    idx += len_data\n",
    "    converted_files.append(converted_file_name)\n",
    "\n",
    "# Concatenate all files\n",
    "output_file = f\"{id_prefix}_sharegpt.jsonl\"\n",
    "with open(output_file, 'w') as outfile:\n",
    "    for fname in tqdm(converted_files):\n",
    "        with open(fname) as infile:\n",
    "            for line in infile:\n",
    "                outfile.write(line)\n",
    "\n",
    "print(f\"Concatenated {len(converted_files)} files to {output_file}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 10%|█         | 1/10 [00:06<00:57,  6.39s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Converted 100000 entries to Phi-3-medium-128k-instruct_sharegpt_shard0.jsonl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 20%|██        | 2/10 [00:12<00:51,  6.47s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Converted 100000 entries to Phi-3-medium-128k-instruct_sharegpt_shard1.jsonl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 30%|███       | 3/10 [00:19<00:46,  6.70s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Converted 100000 entries to Phi-3-medium-128k-instruct_sharegpt_shard2.jsonl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 40%|████      | 4/10 [00:26<00:39,  6.64s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Converted 100000 entries to Phi-3-medium-128k-instruct_sharegpt_shard3.jsonl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 50%|█████     | 5/10 [00:32<00:32,  6.54s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Converted 100000 entries to Phi-3-medium-128k-instruct_sharegpt_shard4.jsonl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 60%|██████    | 6/10 [00:39<00:25,  6.47s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Converted 100000 entries to Phi-3-medium-128k-instruct_sharegpt_shard5.jsonl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 70%|███████   | 7/10 [00:46<00:20,  6.76s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Converted 100000 entries to Phi-3-medium-128k-instruct_sharegpt_shard6.jsonl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 80%|████████  | 8/10 [00:53<00:13,  6.82s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Converted 100000 entries to Phi-3-medium-128k-instruct_sharegpt_shard7.jsonl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 90%|█████████ | 9/10 [00:59<00:06,  6.69s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Converted 100000 entries to Phi-3-medium-128k-instruct_sharegpt_shard8.jsonl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 10/10 [01:07<00:00,  6.72s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Converted 100000 entries to Phi-3-medium-128k-instruct_sharegpt_shard9.jsonl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "100%|██████████| 10/10 [00:10<00:00,  1.02s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Concatenated 10 files to Phi-3-medium-128k-instruct_sharegpt.jsonl\n"
     ]
    }
   ],
   "source": [
    "input_files = [\n",
    "    \"Phi-3-medium-128k-instruct_topp1_temp1_1719007908/Magpie_Phi-3-medium-128k-instruct_100000_1719007908_ins_res_difficulty_quality_category_safety_reward_language.json\",\n",
    "    \"Phi-3-medium-128k-instruct_topp1_temp1_1719008013/Magpie_Phi-3-medium-128k-instruct_100000_1719008013_ins_res_difficulty_quality_category_safety_reward_language.json\",\n",
    "    \"Phi-3-medium-128k-instruct_topp1_temp1_1719008020/Magpie_Phi-3-medium-128k-instruct_100000_1719008020_ins_res_difficulty_quality_category_safety_reward_language.json\",\n",
    "    \"Phi-3-medium-128k-instruct_topp1_temp1.1_1719008124/Magpie_Phi-3-medium-128k-instruct_100000_1719008124_ins_res_difficulty_quality_category_safety_reward_language.json\",\n",
    "    \"Phi-3-medium-128k-instruct_topp1_temp1.1_1719046915/Magpie_Phi-3-medium-128k-instruct_100000_1719046915_ins_res_difficulty_quality_category_safety_reward_language.json\",\n",
    "    \"Phi-3-medium-128k-instruct_topp1_temp1.1_1719046924/Magpie_Phi-3-medium-128k-instruct_100000_1719046924_ins_res_difficulty_quality_category_safety_reward_language.json\",\n",
    "    \"Phi-3-medium-128k-instruct_topp0.99_temp1.2_1719046978/Magpie_Phi-3-medium-128k-instruct_100000_1719046978_ins_res_difficulty_quality_category_safety_reward_language.json\",\n",
    "    \"Phi-3-medium-128k-instruct_topp0.99_temp1.2_1719046987/Magpie_Phi-3-medium-128k-instruct_100000_1719046987_ins_res_difficulty_quality_category_safety_reward_language.json\",\n",
    "    \"Phi-3-medium-128k-instruct_topp0.99_temp1.2_1719100458/Magpie_Phi-3-medium-128k-instruct_100000_1719100458_ins_res_difficulty_quality_category_safety_reward_language.json\",\n",
    "    \"Phi-3-medium-128k-instruct_topp0.99_temp1.25_1719100868/Magpie_Phi-3-medium-128k-instruct_100000_1719100868_ins_res_difficulty_quality_category_safety_reward_language.json\"\n",
    "]\n",
    "\n",
    "# Convert each file to Axolotl format\n",
    "idx = 0\n",
    "id_prefix = \"Phi-3-medium-128k-instruct\"\n",
    "converted_files = []\n",
    "for i in tqdm(range(len(input_files))):\n",
    "    converted_file_name = f\"{id_prefix}_sharegpt_shard{i}.jsonl\"\n",
    "    len_data = convert_to_sharegpt(input_files[i], converted_file_name, id_prefix, idx)\n",
    "    idx += len_data\n",
    "    converted_files.append(converted_file_name)\n",
    "\n",
    "# Concatenate all files\n",
    "output_file = f\"{id_prefix}_sharegpt.jsonl\"\n",
    "with open(output_file, 'w') as outfile:\n",
    "    for fname in tqdm(converted_files):\n",
    "        with open(fname) as infile:\n",
    "            for line in infile:\n",
    "                outfile.write(line)\n",
    "\n",
    "print(f\"Concatenated {len(converted_files)} files to {output_file}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "datarev",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
